In [148]:
import datashader as ds
import datashader.transfer_functions as tf
import datashader.glyphs
from datashader import reductions
from datashader.core import bypixel
from datashader.utils import lnglat_to_meters as webm, export_image
from datashader.colors import colormap_select, Greys9, viridis, inferno
import copy


from pyproj import Proj, transform
import numpy as np
import pandas as pd
import urllib
import json
import datetime
import colorlover as cl

import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
from plotly import tools

# from shapely.geometry import Point, Polygon, shape
# In order to get shapley, you'll need to run [pip install shapely.geometry] from your terminal

from functools import partial

from IPython.display import GeoJSON


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

py.init_notebook_mode()

For module 2 we'll be looking at techniques for dealing with big data. In particular binning strategies and the datashader library (which possibly proves we'll never need to bin large data for visualization ever again.)

To demonstrate these concepts we'll be looking at the PLUTO dataset put out by New York City's department of city planning. PLUTO contains data about every tax lot in New York City.

PLUTO data can be downloaded from here. Unzip them to the same directory as this notebook, and you should be able to read them in using this (or very similar) code. Also take note of the data dictionary, it'll come in handy for this assignment.

In [149]:
ny = pd.read_csv('pluto_22v2.csv', low_memory=False)


# Getting rid of some outliers
ny = ny[(ny['yearbuilt'] > 1850) & (ny['yearbuilt'] < 2020) & (ny['numfloors'] != 0)]
In [150]:
print(ny.dtypes)
borough                  object
block                     int64
lot                       int64
cd                      float64
bct2020                 float64
bctcb2020               float64
ct2010                  float64
cb2010                  float64
schooldist              float64
council                 float64
zipcode                 float64
firecomp                 object
policeprct              float64
healthcenterdistrict    float64
healtharea              float64
sanitboro               float64
sanitdistrict           float64
sanitsub                 object
address                  object
zonedist1                object
zonedist2                object
zonedist3                object
zonedist4                object
overlay1                 object
overlay2                 object
spdist1                  object
spdist2                  object
spdist3                 float64
ltdheight                object
splitzone                object
bldgclass                object
landuse                 float64
easements               float64
ownertype                object
ownername                object
lotarea                 float64
bldgarea                float64
comarea                 float64
resarea                 float64
officearea              float64
retailarea              float64
garagearea              float64
strgearea               float64
factryarea              float64
otherarea               float64
areasource              float64
numbldgs                float64
numfloors               float64
unitsres                float64
unitstotal              float64
lotfront                float64
lotdepth                float64
bldgfront               float64
bldgdepth               float64
ext                      object
proxcode                float64
irrlotcode               object
lottype                 float64
bsmtcode                float64
assessland              float64
assesstot               float64
exempttot               float64
yearbuilt               float64
yearalter1              float64
yearalter2              float64
histdist                 object
landmark                 object
builtfar                float64
residfar                float64
commfar                 float64
facilfar                float64
borocode                  int64
bbl                     float64
condono                 float64
tract2010               float64
xcoord                  float64
ycoord                  float64
zonemap                  object
zmcode                   object
sanborn                  object
taxmap                  float64
edesignum                object
appbbl                  float64
appdate                  object
plutomapid                int64
firm07_flag             float64
pfirm15_flag            float64
version                  object
dcpedited                object
latitude                float64
longitude               float64
notes                   float64
dtype: object

I'll also do some prep for the geographic component of this data, which we'll be relying on for datashader.

You're not required to know how I'm retrieving the lattitude and longitude here, but for those interested: this dataset uses a flat x-y projection (assuming for a small enough area that the world is flat for easier calculations), and this needs to be projected back to traditional lattitude and longitude.

In [151]:
# wgs84 = Proj("+proj=longlat +ellps=GRS80 +datum=NAD83 +no_defs")
# nyli = Proj("+proj=lcc +lat_1=40.66666666666666 +lat_2=41.03333333333333 +lat_0=40.16666666666666 +lon_0=-74 +x_0=300000 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs")
# ny['xcoord'] = 0.3048*ny['xcoord']
# ny['ycoord'] = 0.3048*ny['ycoord']
# ny['lon'], ny['lat'] = transform(nyli, wgs84, ny['xcoord'].values, ny['ycoord'].values)

# ny = ny[(ny['lon'] < -60) & (ny['lon'] > -100) & (ny['lat'] < 60) & (ny['lat'] > 20)]

#Defining some helper functions for DataShader
background = "black"
export = partial(export_image, background = background, export_path="export")
cm = partial(colormap_select, reverse=(background!="black"))
In [152]:
ny.head()
Out[152]:
borough block lot cd bct2020 bctcb2020 ct2010 cb2010 schooldist council zipcode firecomp policeprct healthcenterdistrict healtharea sanitboro sanitdistrict sanitsub address zonedist1 zonedist2 zonedist3 zonedist4 overlay1 overlay2 spdist1 spdist2 spdist3 ltdheight splitzone bldgclass landuse easements ownertype ownername lotarea bldgarea comarea resarea officearea retailarea garagearea strgearea factryarea otherarea areasource numbldgs numfloors unitsres unitstotal lotfront lotdepth bldgfront bldgdepth ext proxcode irrlotcode lottype bsmtcode assessland assesstot exempttot yearbuilt yearalter1 yearalter2 histdist landmark builtfar residfar commfar facilfar borocode bbl condono tract2010 xcoord ycoord zonemap zmcode sanborn taxmap edesignum appbbl appdate plutomapid firm07_flag pfirm15_flag version dcpedited latitude longitude notes
0 SI 1597 125 502.0 5029104.0 5.029104e+10 291.04 3007.0 31.0 50.0 10314.0 E166 121.0 51.0 100.0 5.0 2.0 4E 285 ARLENE STREET R3X NaN NaN NaN NaN NaN NaN NaN NaN NaN N B2 1.0 0.0 NaN ROMAN, ADRIENNE 4000.0 2244.0 0.0 1302.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.0 1.0 2.0 2.0 40.0 100.0 27.00 51.0 G 1.0 N 5.0 1.0 10560.0 56580.0 0.0 1965.0 0.0 0.0 NaN NaN 0.56 0.50 0.0 1.0 5 5.015970e+09 NaN 29104.0 938611.0 161974.0 20d NaN 502 275 50702.0 NaN NaN NaN 1 NaN NaN 22v2 NaN 40.611140 -74.164376 NaN
2 BK 4794 1 309.0 3080600.0 3.080600e+10 806.00 2000.0 17.0 41.0 11203.0 E249 71.0 32.0 4800.0 3.0 9.0 2B 610 KINGSTON AVENUE R6 NaN NaN NaN NaN NaN NaN NaN NaN NaN N S9 4.0 0.0 NaN 529 MAPLE LLC 4000.0 6830.0 1800.0 5030.0 0.0 1800.0 0.0 0.0 0.0 0.0 2.0 2.0 3.0 4.0 7.0 100.0 40.0 42.00 107.0 E 3.0 N 3.0 5.0 8100.0 316800.0 0.0 1899.0 0.0 0.0 NaN NaN 1.71 2.43 0.0 4.8 3 3.047940e+09 NaN 806.0 1000194.0 180391.0 17b NaN 310 049 31506.0 NaN 3.047940e+09 08/12/2005 1 NaN NaN 22v2 NaN 40.661794 -73.942532 NaN
3 BK 1488 105 303.0 3037500.0 3.037500e+10 375.00 1001.0 16.0 41.0 11221.0 E222 81.0 34.0 3100.0 3.0 3.0 5A 932 JEFFERSON AVENUE R6B NaN NaN NaN NaN NaN NaN NaN NaN NaN N B2 1.0 0.0 NaN LEVY, RODNEY K 2025.0 2856.0 0.0 2856.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.0 3.0 2.0 2.0 0.0 0.0 20.25 47.0 N 2.0 N 0.0 5.0 17280.0 58020.0 0.0 1992.0 0.0 0.0 NaN NaN 1.41 2.00 0.0 2.0 3 3.014880e+09 NaN 375.0 1006390.0 189391.0 17a NaN 305 026 30601.0 NaN 3.014880e+09 11/04/1992 1 NaN NaN 22v2 NaN 40.686484 -73.920169 NaN
4 BK 4794 17 309.0 3080600.0 3.080600e+10 806.00 2000.0 17.0 41.0 11203.0 E249 71.0 32.0 4800.0 3.0 9.0 2B 624 EAST NEW YORK AVENUE R6 NaN NaN NaN NaN NaN NaN NaN NaN NaN N C0 2.0 0.0 NaN SEVERE CHARLES 2000.0 2400.0 0.0 2400.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.0 2.0 3.0 3.0 20.0 100.0 20.00 60.0 N 3.0 N 5.0 2.0 20040.0 66420.0 33210.0 1991.0 0.0 0.0 NaN NaN 1.20 2.43 0.0 4.8 3 3.047940e+09 NaN 806.0 1000344.0 180415.0 17b NaN 310 049 31506.0 NaN NaN NaN 1 NaN NaN 22v2 NaN 40.661859 -73.941991 NaN
5 BK 4794 78 309.0 3080600.0 3.080600e+10 806.00 2000.0 17.0 41.0 11225.0 E249 71.0 32.0 4800.0 3.0 9.0 2B 545 MAPLE STREET R6 NaN NaN NaN NaN NaN NaN NaN NaN NaN N C0 2.0 0.0 NaN CHETRIT, MEIR 1400.0 3240.0 0.0 3240.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.0 3.0 3.0 3.0 20.0 70.0 20.00 54.0 N 2.0 N 5.0 2.0 20520.0 99000.0 0.0 2005.0 0.0 0.0 NaN NaN 2.31 2.43 0.0 4.8 3 3.047940e+09 NaN 806.0 1000192.0 180290.0 17b NaN 310 049 31506.0 NaN 3.047940e+09 04/11/2006 1 NaN NaN 22v2 NaN 40.661517 -73.942539 NaN

Part 1: Binning and Aggregation

Binning is a common strategy for visualizing large datasets. Binning is inherent to a few types of visualizations, such as histograms and 2D histograms (also check out their close relatives: 2D density plots and the more general form: heatmaps.

While these visualization types explicitly include binning, any type of visualization used with aggregated data can be looked at in the same way. For example, lets say we wanted to look at building construction over time. This would be best viewed as a line graph, but we can still think of our results as being binned by year:

In [153]:
trace = go.Scatter(
    # I'm choosing BBL here because I know it's a unique key.
    x = ny.groupby('yearbuilt').count()['bbl'].index,
    y = ny.groupby('yearbuilt').count()['bbl']
)

layout = go.Layout(
    xaxis = dict(title = 'Year Built'),
    yaxis = dict(title = 'Number of Lots Built')
)

fig = go.FigureWidget(data = [trace], layout = layout)

fig
In [154]:
ny.isna().sum()
Out[154]:
borough                      0
block                        0
lot                          0
cd                         137
bct2020                    137
bctcb2020                  137
ct2010                     137
cb2010                     137
schooldist                 191
council                    137
zipcode                    201
firecomp                   195
policeprct                 191
healthcenterdistrict       191
healtharea                 191
sanitboro                  211
sanitdistrict              211
sanitsub                   300
address                      0
zonedist1                  190
zonedist2               794616
zonedist3               811936
zonedist4               812059
overlay1                742599
overlay2                811913
spdist1                 716547
spdist2                 812010
spdist3                 812064
ltdheight               809849
splitzone                  190
bldgclass                    0
landuse                    693
easements                    0
ownertype               792477
ownername                   17
lotarea                      0
bldgarea                    10
comarea                   7041
resarea                   7041
officearea                7041
retailarea                7041
garagearea                7041
strgearea                 7041
factryarea                7041
otherarea                 7041
areasource                   0
numbldgs                     0
numfloors                 2271
unitsres                   172
unitstotal                 171
lotfront                     0
lotdepth                     0
bldgfront                    0
bldgdepth                    0
ext                      23569
proxcode                     0
irrlotcode                   0
lottype                      0
bsmtcode                     0
assessland                   0
assesstot                    0
exempttot                    0
yearbuilt                    0
yearalter1                   0
yearalter2                   0
histdist                782186
landmark                810835
builtfar                    18
residfar                     0
commfar                      0
facilfar                     0
borocode                     0
bbl                          0
condono                 802907
tract2010                  137
xcoord                     149
ycoord                     149
zonemap                    182
zmcode                  797736
sanborn                    309
taxmap                     309
edesignum               804277
appbbl                  723899
appdate                 723899
plutomapid                   0
firm07_flag             785598
pfirm15_flag            756147
version                      0
dcpedited               784741
latitude                   149
longitude                  149
notes                   812064
dtype: int64

Something looks off... You're going to have to deal with this imperfect data to answer this first question.

But first: some notes on pandas. Pandas dataframes are a different beast than R dataframes, here are some tips to help you get up to speed:


Hello all, here are some pandas tips to help you guys through this homework:

Indexing and Selecting: .loc and .iloc are the analogs for base R subsetting, or filter() in dplyr

Group By: This is the pandas analog to group_by() and the appended function the analog to summarize(). Try out a few examples of this, and display the results in Jupyter. Take note of what's happening to the indexes, you'll notice that they'll become hierarchical. I personally find this more of a burden than a help, and this sort of hierarchical indexing leads to a fundamentally different experience compared to R dataframes. Once you perform an aggregation, try running the resulting hierarchical datafrome through a reset_index().

Reset_index: I personally find the hierarchical indexes more of a burden than a help, and this sort of hierarchical indexing leads to a fundamentally different experience compared to R dataframes. reset_index() is a way of restoring a dataframe to a flatter index style. Grouping is where you'll notice it the most, but it's also useful when you filter data, and in a few other split-apply-combine workflows. With pandas indexes are more meaningful, so use this if you start getting unexpected results.

Indexes are more important in Pandas than in R. If you delve deeper into the using python for data science, you'll begin to see the benefits in many places (despite the personal gripes I highlighted above.) One place these indexes come in handy is with time series data. The pandas docs have a huge section on datetime indexing. In particular, check out resample, which provides time series specific aggregation.

Merging, joining, and concatenation: There's some overlap between these different types of merges, so use this as your guide. Concat is a single function that replaces cbind and rbind in R, and the results are driven by the indexes. Read through these examples to get a feel on how these are performed, but you will have to manage your indexes when you're using these functions. Merges are fairly similar to merges in R, similarly mapping to SQL joins.

Apply: This is explained in the "group by" section linked above. These are your analogs to the plyr library in R. Take note of the lambda syntax used here, these are anonymous functions in python. Rather than predefining a custom function, you can just define it inline using lambda.

Browse through the other sections for some other specifics, in particular reshaping and categorical data (pandas' answer to factors.) Pandas can take a while to get used to, but it is a pretty strong framework that makes more advanced functions easier once you get used to it. Rolling functions for example follow logically from the apply workflow (and led to the best google results ever when I first tried to find this out and googled "pandas rolling")

Google Wes Mckinney's book "Python for Data Analysis," which is a cookbook style intro to pandas. It's an O'Reilly book that should be pretty available out there.


Question

After a few building collapses, the City of New York is going to begin investigating older buildings for safety. The city is particularly worried about buildings that were unusually tall when they were built, since best-practices for safety hadn’t yet been determined. Create a graph that shows how many buildings of a certain number of floors were built in each year (note: you may want to use a log scale for the number of buildings). Find a strategy to bin buildings (It should be clear 20-29-story buildings, 30-39-story buildings, and 40-49-story buildings were first built in large numbers, but does it make sense to continue in this way as you get taller?)

In [155]:
ny_2 = ny.dropna(subset = ['numfloors']).copy()

ny_2[['numfloors_bin']] = (np.floor(ny_2.numfloors/10)*10).astype('int')
ny_2[['numfloors_bin']] = np.where(ny_2.numfloors_bin>=50,'50+',
                                   ny_2.numfloors_bin.astype('string').str.cat((ny_2.numfloors_bin+9).astype('string'),sep='-')
                                  )

ny_2.groupby('numfloors_bin').count()['borough']
Out[155]:
numfloors_bin
0-9      803361
10-19      4523
20-29      1093
30-39       463
40-49       225
50+         128
Name: borough, dtype: int64
In [156]:
ny_2[['yearbuilt_bin']] = (np.floor(ny_2.yearbuilt/5)*5).astype('int')
ny_2[['yearbuilt_bin']] = np.where(ny_2.yearbuilt_bin>=2015,'2015+',
                                   ny_2.yearbuilt_bin.astype('string').str.cat((ny_2.yearbuilt_bin+4).astype('string'),sep='-')
                                  )

ny_2.groupby('yearbuilt_bin').count()['borough']
Out[156]:
yearbuilt_bin
1850-1854       791
1855-1859       777
1860-1864       603
1865-1869       954
1870-1874      1423
1875-1879      1342
1880-1884      2036
1885-1889      3095
1890-1894      3476
1895-1899     22515
1900-1904     30078
1905-1909     11301
1910-1914     45313
1915-1919     17226
1920-1924     93883
1925-1929     83004
1930-1934    108273
1935-1939     28104
1940-1944     39321
1945-1949     26806
1950-1954     49436
1955-1959     30038
1960-1964     40683
1965-1969     21914
1970-1974     19848
1975-1979     12686
1980-1984      9172
1985-1989     18053
1990-1994     13381
1995-1999     15574
2000-2004     23667
2005-2009     19406
2010-2014      6893
2015+          8721
Name: borough, dtype: int64
In [157]:
stories = ny_2.groupby(['yearbuilt_bin','numfloors_bin']).count()['bbl'].reset_index()
stories.columns = ['yearbuilt_bin', 'numfloors_bin', 'count']


#Put in blank values so everything always sorts correctly
#Using NA instead of 0 so the difference between 0 and a couple is clear
for y in stories.yearbuilt_bin.unique():
    for n in stories.numfloors_bin.unique():
        if (len(stories.loc[(stories.yearbuilt_bin == y) & (stories.numfloors_bin == n),:])==0):
            stories = stories.append({'yearbuilt_bin':y,'numfloors_bin':n,'count':np.nan}, ignore_index=True)

stories['logcount'] = np.log(stories['count'])
stories.sort_values(['yearbuilt_bin','numfloors_bin'],inplace=True,ignore_index=True)

stories.head()
Out[157]:
yearbuilt_bin numfloors_bin count logcount
0 1850-1854 0-9 789.0 6.670766
1 1850-1854 10-19 2.0 0.693147
2 1850-1854 20-29 NaN NaN
3 1850-1854 30-39 NaN NaN
4 1850-1854 40-49 NaN NaN
In [158]:
fig = go.Figure()

# Loop df columns and floor bins to the figure
for nfb in stories['numfloors_bin'].unique():
    tmp_df = stories.loc[stories.numfloors_bin==nfb,:]
    fig.add_trace(go.Scatter(x=tmp_df.yearbuilt_bin, y=tmp_df.logcount,
                        mode='lines', # 'lines' or 'markers'
                        name=nfb))

fig.update_layout(
    title="Number of Buildings by Stories and Year",
    xaxis_title="Year Built",
    yaxis_title="Count of Buildings on Log Scale",
    legend_title="Number of Stories",

)
fig.show()

Although not as pretty as if the lines were all connected, I think this provides more information. It is clear there are no 10+ story buildings were built until 1880, no 20+ story buildings until 1890. So the 10-19 story buildings built from 1860-1874 seem like a concern. Also, there were no 30-40 story buildings until 1920. Yet there were 40+ story buildings built from 1905-1919. These would also be a good place to look.

Part 2: Datashader

Datashader is a library from Anaconda that does away with the need for binning data. It takes in all of your datapoints, and based on the canvas and range returns a pixel-by-pixel calculations to come up with the best representation of the data. In short, this completely eliminates the need for binning your data.

As an example, lets continue with our question above and look at a 2D histogram of YearBuilt vs NumFloors:

In [159]:
fig = go.FigureWidget(
    data = [
        go.Histogram2d(x=ny['yearbuilt'], y=ny['numfloors'], autobiny=False, ybins={'size': 1}, colorscale='Greens')
    ]
)

fig

This shows us the distribution, but it's subject to some biases discussed in the Anaconda notebook Plotting Perils.

Here is what the same plot would look like in datashader:

In [160]:
#Defining some helper functions for DataShader
background = "black"
export = partial(export_image, background = background, export_path="export")
cm = partial(colormap_select, reverse=(background!="black"))

cvs = ds.Canvas(800, 500, x_range = (ny['yearbuilt'].min(), ny['yearbuilt'].max()), 
                                y_range = (ny['numfloors'].min(), ny['numfloors'].max()))
agg = cvs.points(ny, 'yearbuilt', 'numfloors')
view = tf.shade(agg, cmap = cm(Greys9), how='log')
export(tf.spread(view, px=2), 'yearvsnumfloors')
Out[160]:

That's technically just a scatterplot, but the points are smartly placed and colored to mimic what one gets in a heatmap. Based on the pixel size, it will either display individual points, or will color the points of denser regions.

Datashader really shines when looking at geographic information. Here are the latitudes and longitudes of our dataset plotted out, giving us a map of the city colored by density of structures:

In [161]:
NewYorkCity   = (( 913164.0,  1067279.0), (120966.0, 272275.0))
cvs = ds.Canvas(700, 700, *NewYorkCity)
agg = cvs.points(ny, 'xcoord', 'ycoord')
view = tf.shade(agg, cmap = cm(inferno), how='log')
export(tf.spread(view, px=2), 'firery')
Out[161]:

Interestingly, since we're looking at structures, the large buildings of Manhattan show up as less dense on the map. The densest areas measured by number of lots would be single or multi family townhomes.

Unfortunately, Datashader doesn't have the best documentation. Browse through the examples from their github repo. I would focus on the visualization pipeline and the US Census Example for the question below. Feel free to use my samples as templates as well when you work on this problem.

Question

You work for a real estate developer and are researching underbuilt areas of the city. After looking in the Pluto data dictionary, you've discovered that all tax assessments consist of two parts: The assessment of the land and assessment of the structure. You reason that there should be a correlation between these two values: more valuable land will have more valuable structures on them (more valuable in this case refers not just to a mansion vs a bungalow, but an apartment tower vs a single family home). Deviations from the norm could represent underbuilt or overbuilt areas of the city. You also recently read a really cool blog post about bivariate choropleth maps, and think the technique could be used for this problem.

Datashader is really cool, but it's not that great at labeling your visualization. Don't worry about providing a legend, but provide a quick explanation as to which areas of the city are overbuilt, which areas are underbuilt, and which areas are built in a way that's properly correlated with their land value.

In [162]:
nys = ny.sample(frac = 0.05)
fig = px.scatter(nys, x="assesstot", y="assessland", trendline="ols")
fig.show()
In [163]:
from sklearn.metrics import r2_score
r2_score(ny.assesstot,ny.assessland)
Out[163]:
0.5493696387583773

OK, definitely follows the trend we'd expect, let's create some colors and make a map

In [164]:
land = ny[['xcoord','ycoord','assesstot','assessland']].groupby(['xcoord','ycoord']).mean().reset_index()
land
Out[164]:
xcoord ycoord assesstot assessland
0 913164.0 124024.0 95040.0 82260.0
1 913258.0 124279.0 54720.0 24240.0
2 913264.0 123753.0 74880.0 35460.0
3 913291.0 124626.0 117300.0 31020.0
4 913292.0 124196.0 69480.0 20820.0
... ... ... ... ...
811903 1067260.0 209142.0 53820.0 18840.0
811904 1067264.0 209092.0 46740.0 19260.0
811905 1067268.0 209042.0 44340.0 16500.0
811906 1067269.0 208800.0 397350.0 83250.0
811907 1067279.0 208931.0 1215900.0 543600.0

811908 rows × 4 columns

In [165]:
#Land = Yellow
l_colors = [[255,252,221],[254,236,75],[197,177,1]]

#Structure = Blue
s_colors = [[206,226,247],[81,153,225],[24,84,143]]
In [166]:
def get_color(s,l):
    new_color = list()
    for i in range(3):
        new_color.append(int(np.floor((s[i]+l[i])/2)))
    return(tuple(new_color))
In [167]:
color_map = dict()
for s in range(3):
    for l in range(3):
        color_map['s' + str(s) + 'l' + str(l)] = get_color(s_colors[s],l_colors[l])
color_map
Out[167]:
{'s0l0': (230, 239, 234),
 's0l1': (230, 231, 161),
 's0l2': (201, 201, 124),
 's1l0': (168, 202, 223),
 's1l1': (167, 194, 150),
 's1l2': (139, 165, 113),
 's2l0': (139, 168, 182),
 's2l1': (139, 160, 109),
 's2l2': (110, 130, 72)}
In [168]:
land['structure_group'] = pd.qcut(land.assesstot,[0,.3333,.6667,1], labels=['s0','s1','s2']) 
land['land_group'] = pd.qcut(land.assessland,[0,.3333,.6667,1], labels=['l0','l1','l2']) 
land['assess_group'] = land.structure_group.str.cat(land.land_group).astype('category')
land.head()
Out[168]:
xcoord ycoord assesstot assessland structure_group land_group assess_group
0 913164.0 124024.0 95040.0 82260.0 s2 l2 s2l2
1 913258.0 124279.0 54720.0 24240.0 s1 l2 s1l2
2 913264.0 123753.0 74880.0 35460.0 s1 l2 s1l2
3 913291.0 124626.0 117300.0 31020.0 s2 l2 s2l2
4 913292.0 124196.0 69480.0 20820.0 s1 l2 s1l2
In [169]:
NewYorkCity   = (( 913164.0,  1067279.0), (120966.0, 272275.0))
cvs = ds.Canvas(700, 700, *NewYorkCity)
agg = cvs.points(land, 'xcoord', 'ycoord',ds.count_cat('assess_group'))
view = tf.shade(agg, color_key=color_map)
export(tf.spread(view, px=2), 'firery')
Out[169]:

So, I will admint I've only been to NY once, and had to google a map. But from what I can see the "Blue" areas, which are areas that have building costs higher than you'd expect based on the land prices, show up in the Bronx, northern Queens, southern Brooklyn, and parts of Staten Island.

Manhattan is the most obvious area where high land and building values align as expected. Parts of Staten Island, the Bronx, and southern Queens and Brooklyn have areas where low building and land values align as expected.

A stretch through Brooklyn and Queens, as well as a parts of Staten Island have "Yellow" areas which are areas that have lower building costs than you'd expect based on land prices.